# Mount Google Drive so the Colab runtime can read the dataset files.
from google.colab import drive
drive.mount('/content/gdrive')
# Data handling / plotting / clustering imports used throughout the notebook.
import pandas as pd
import numpy as np
import os
import sys
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
import dask.dataframe as dd
import plotly.express as px
import datetime as dt
from sklearn.cluster import KMeans
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import silhouette_score
import matplotlib.colors
# Data location: empty string means the current working directory.
# (On Colab this was "/content/gdrive/My Drive/Data Mining/unzipped/".)
file_directory = ""

# Tariff schedule: re-index each row by a "minute-hour-day-month" key so a
# reading's timestamp can be mapped to its tariff band with one .loc lookup.
df_tariff = pd.read_csv(os.path.join(file_directory, "Tariffs.csv"))
df_tariff.head()
df_tariff['TariffDateTime'] = pd.to_datetime(df_tariff['TariffDateTime'])
df_tariff['TariffDateTime'] = df_tariff['TariffDateTime'].apply(
    lambda ts: f"{ts.minute}-{ts.hour}-{ts.day}-{ts.month}"
)
df_tariff.set_index('TariffDateTime', inplace=True)
df_tariff.head()
df_tariff

# Half-hourly smart-meter readings: df holds the standard-tariff households,
# df2 the dynamic time-of-use households.
df = pd.read_pickle("./completedata.pkl")
df['DateTime'] = pd.to_datetime(df['DateTime'])
df2 = pd.read_pickle("./dynamicdatapickle.pkl")
df2['DateTime'] = pd.to_datetime(df2['DateTime'])
def fetchtype(x, tariff_df=None):
    """Return the tariff band for timestamp *x*, or NaN when no entry exists.

    Parameters
    ----------
    x : datetime-like
        Timestamp of a meter reading (anything exposing .minute/.hour/.day/.month).
    tariff_df : pandas.DataFrame, optional
        Tariff table indexed by "minute-hour-day-month" keys with a 'Tariff'
        column. Defaults to the module-level ``df_tariff`` built above.

    Returns
    -------
    The 'Tariff' value for the matching key, or ``np.nan`` if the key is absent.
    """
    lookup = df_tariff if tariff_df is None else tariff_df
    # Key format must match the index constructed on df_tariff.
    key = f"{x.minute}-{x.hour}-{x.day}-{x.month}"
    try:
        return lookup.loc[key, 'Tariff']
    except KeyError:
        # No tariff scheduled for this timestamp.
        return np.nan
# Attach the tariff band to every dynamic-tariff reading by matching on the
# minute-hour-day-month key built by fetchtype.
df2["Std type"] = df2["DateTime"].apply(fetchtype)
df.head()
df2.head()
# Mean half-hourly usage per household per calendar month (standard tariff);
# the Grouper first collapses readings to month-end periods.
groupby = df.groupby(['LCLid',pd.Grouper(key="DateTime",freq='M'),"Acorn Grouped"])["KWH/hh"].mean()
groupby= groupby.reset_index()
groupby.head()
# Collapse per-year months into month-of-year (1-12) and average again,
# yielding one mean consumption value per (household, month-of-year).
groupby["month"] = groupby["DateTime"].dt.month
groupby = groupby.groupby(['LCLid',"month","Acorn Grouped"])["KWH/hh"].mean()
groupby = groupby.reset_index()
groupby.head()
len(groupby)
groupby["month"].unique()
# Months belonging to each (northern-hemisphere) season. Insertion order fixes
# the output column order: Summer, Winter, Spring, Autumn.
SEASON_MONTHS = {"Summer": (6, 7, 8), "Winter": (12, 1, 2),
                 "Spring": (3, 4, 5), "Autumn": (9, 10, 11)}

def _user_season_row(user_detail, user):
    """Build one feature row for *user*: per-season sums of monthly mean KWH/hh.

    user_detail holds the user's (month, KWH/hh, Acorn Grouped) records;
    a month with no readings contributes 0 to its season's total.
    """
    row = {"LCLid": user}
    for season, months in SEASON_MONTHS.items():
        total = 0
        for m in months:
            vals = user_detail[user_detail["month"] == m]["KWH/hh"].values
            total += vals[0] if len(vals) else 0
        row[season] = total
    row["Acorn_group"] = user_detail["Acorn Grouped"].unique()[0]
    return row

unique_user = groupby["LCLid"].unique()
# Build all rows first and create a single DataFrame — much cheaper than
# concatenating one-row frames, and yields a unique 0..n-1 index.
new_df = pd.DataFrame([_user_season_row(groupby[groupby["LCLid"] == user], user)
                       for user in unique_user])
len(new_df)
new_df.head()
# Cluster the standard-tariff customers (k=2) on seasonal usage + ACORN group.
standard_cus = new_df.copy()
# Feature matrix; .copy() prevents SettingWithCopyWarning when label-encoding
# the Acorn_group column below.
new_df2 = new_df[["Summer","Winter","Spring","Autumn","Acorn_group"]].copy()
new_df2
from sklearn import preprocessing
le = preprocessing.LabelEncoder()
new_df2["Acorn_group"] = le.fit_transform(new_df2["Acorn_group"])
new_df2.head()
from sklearn.cluster import KMeans
X = new_df2.values
# Row-wise L2 normalisation: cluster on the *shape* of seasonal usage rather
# than its absolute magnitude.
W = preprocessing.normalize(X)
cluster = KMeans(n_clusters=2, n_init=12)
cluster = cluster.fit(W)
labels = cluster.labels_
print (labels)
new_df["cluster"] = labels
new_df.head()
# Average seasonal profile of each cluster (standard-tariff customers).
cluster_type = [0, 1]
y_axis = []
for c in cluster_type:
    c_data = new_df[new_df["cluster"] == c]
    display(c_data.head())
    # Compute the four seasonal means once instead of four times.
    season_means = c_data[["Summer", "Winter", "Spring", "Autumn"]].mean()
    y_axis.append([season_means["Winter"], season_means["Spring"],
                   season_means["Summer"], season_means["Autumn"]])
plt.plot(["Winter", "Spring", "Summer", "Autumn"], y_axis[0], label="cluster 1")
plt.plot(["Winter", "Spring", "Summer", "Autumn"], y_axis[1], label="cluster 2")
plt.xlabel("Seasons")
plt.ylabel("Average KWH usage")
plt.title("Visualization of cluster of standard user with season")
# Bug fix: labels were set but the legend was never drawn.
plt.legend()
plt.show()
from sklearn.cluster import KMeans
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import silhouette_score

# Silhouette analysis to choose k for the standard-tariff seasonal features.
sillhoute_scores = []
n_cluster_list = list(range(2, 16))
X = new_df[["Summer","Winter","Spring","Autumn"]].values.copy()
# Very important to scale!
sc = MinMaxScaler()
X = sc.fit_transform(X)
for k in n_cluster_list:
    kmeans = KMeans(n_clusters=k)
    kmeans.fit_predict(X)
    sillhoute_scores.append(silhouette_score(X, kmeans.labels_))
# Bug fix: plot against the actual cluster counts (2..15); previously the
# x-axis was the list index 0..13 while labelled "Number of cluster".
plt.plot(n_cluster_list, sillhoute_scores)
plt.xlabel("Number of cluster")
plt.ylabel("sillhoute score")
plt.title("Finding ideal number of clusters for standard user")
plt.show()
# Mean half-hourly usage per household per calendar month (dynamic ToU tariff);
# same pipeline as for the standard-tariff customers above.
groupby = df2.groupby(['LCLid',pd.Grouper(key="DateTime",freq='M'),"Acorn Grouped"])["KWH/hh"].mean()
groupby= groupby.reset_index()
groupby.head()
# Collapse per-year months into month-of-year (1-12) and average again.
groupby["month"] = groupby["DateTime"].dt.month
groupby.head()
groupby = groupby.groupby(['LCLid',"month","Acorn Grouped"])["KWH/hh"].mean()
groupby = groupby.reset_index()
groupby.head()
len(groupby)
groupby["month"].unique()
# Months belonging to each (northern-hemisphere) season. Insertion order fixes
# the output column order: Summer, Winter, Spring, Autumn.
SEASON_MONTHS = {"Summer": (6, 7, 8), "Winter": (12, 1, 2),
                 "Spring": (3, 4, 5), "Autumn": (9, 10, 11)}

def _user_season_row(user_detail, user):
    """Build one feature row for *user*: per-season sums of monthly mean KWH/hh.

    user_detail holds the user's (month, KWH/hh, Acorn Grouped) records;
    a month with no readings contributes 0 to its season's total.
    """
    row = {"LCLid": user}
    for season, months in SEASON_MONTHS.items():
        total = 0
        for m in months:
            vals = user_detail[user_detail["month"] == m]["KWH/hh"].values
            total += vals[0] if len(vals) else 0
        row[season] = total
    row["Acorn_group"] = user_detail["Acorn Grouped"].unique()[0]
    return row

unique_user = groupby["LCLid"].unique()
# Build all rows first and create a single DataFrame — much cheaper than
# concatenating one-row frames, and yields a unique 0..n-1 index.
new_df = pd.DataFrame([_user_season_row(groupby[groupby["LCLid"] == user], user)
                       for user in unique_user])
len(new_df)
new_df.head()
# Cluster the dynamic-tariff customers (k=2) on seasonal usage + ACORN group.
dynamic_cust = new_df.copy()
new_df.describe()
new_df.isnull().sum(axis = 0)
# Feature matrix; .copy() prevents SettingWithCopyWarning when label-encoding
# the Acorn_group column below.
new_df2 = new_df[["Summer","Winter","Spring","Autumn","Acorn_group"]].copy()
from sklearn import preprocessing
le = preprocessing.LabelEncoder()
new_df2["Acorn_group"] = le.fit_transform(new_df2["Acorn_group"])
from sklearn.cluster import KMeans
X = new_df2.values
# Row-wise L2 normalisation: cluster on the *shape* of seasonal usage rather
# than its absolute magnitude.
W = preprocessing.normalize(X)
cluster = KMeans(n_clusters=2, n_init=12)
cluster = cluster.fit(W)
labels = cluster.labels_
print (labels)
new_df["cluster"] = labels
new_df.head()
# Average seasonal profile of each cluster (dynamic-tariff customers).
cluster_type = [0, 1]
y_axis = []
for c in cluster_type:
    c_data = new_df[new_df["cluster"] == c]
    display(c_data.head())
    # Compute the four seasonal means once instead of four times.
    season_means = c_data[["Summer", "Winter", "Spring", "Autumn"]].mean()
    y_axis.append([season_means["Winter"], season_means["Spring"],
                   season_means["Summer"], season_means["Autumn"]])
# Consistency with the standard-customer plot: label the lines and show a legend.
plt.plot(["Winter", "Spring", "Summer", "Autumn"], y_axis[0], label="cluster 1")
plt.plot(["Winter", "Spring", "Summer", "Autumn"], y_axis[1], label="cluster 2")
plt.xlabel("Seasons")
plt.ylabel("Average KWH usage")
plt.title("Visualization of cluster of dynamic user with season")
plt.legend()
plt.show()
from sklearn.cluster import KMeans
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import silhouette_score

# Silhouette analysis to choose k for the dynamic-tariff seasonal features.
sillhoute_scores = []
n_cluster_list = list(range(2, 16))
X = new_df[["Summer","Winter","Spring","Autumn"]].values.copy()
# Very important to scale!
sc = MinMaxScaler()
X = sc.fit_transform(X)
for k in n_cluster_list:
    kmeans = KMeans(n_clusters=k)
    kmeans.fit_predict(X)
    sillhoute_scores.append(silhouette_score(X, kmeans.labels_))
# Bug fix: plot against the actual cluster counts (2..15); previously the
# x-axis was the list index 0..13 while labelled "Number of cluster".
plt.plot(n_cluster_list, sillhoute_scores)
plt.xlabel("Number of cluster")
plt.ylabel("sillhoute score")
plt.title("Finding ideal number of clusters for dynamic user")
plt.show()
def find_plot_silhoutte(foo_df, foo_title):
    """Plot silhouette scores for k=2..24 k-means on mean hourly usage rows.

    Parameters
    ----------
    foo_df : pandas.DataFrame
        Must contain 'LCLid', 'DateTime' (datetime64) and 'KWH/hh' columns.
    foo_title : str
        Customer-group name used in the plot title (e.g. 'STD', 'DToU').

    Displays a plotly line chart; returns None.
    """
    cols_to_use = ['LCLid', 'DateTime', 'KWH/hh']
    # .copy() so overwriting DateTime with its hour below cannot trigger a
    # SettingWithCopyWarning or mutate the caller's frame.
    temp_df = foo_df[cols_to_use].copy()
    temp_df["DateTime"] = temp_df["DateTime"].dt.hour
    # Mean usage per (household, hour-of-day); each row is one clustering sample.
    groupby = temp_df.groupby(['LCLid', 'DateTime'])["KWH/hh"].mean()
    groupby = groupby.reset_index()
    groupby = groupby.drop(['LCLid'], axis=1)
    X = groupby.values
    sillhoute_scores = []
    n_cluster_list = np.arange(2, 25).astype(int)  # up to 24 clusters (hours 0-23)
    # Very important to scale!
    sc = MinMaxScaler()
    X = sc.fit_transform(X)
    for i in tqdm(n_cluster_list):
        kmeans = KMeans(n_clusters=i)
        kmeans.fit_predict(X)
        sillhoute_scores.append(silhouette_score(X, kmeans.labels_))
    fig = px.line(sillhoute_scores, y=sillhoute_scores, x=n_cluster_list)
    fig.update_layout(
        title_text='Silhoutte Score for {} Groups'.format(foo_title),
        xaxis_title = 'Number of Clusters',
        yaxis_title = 'Sillhoute Score',
        xaxis = dict(
            tickmode = 'array',
            tickvals = n_cluster_list,
            ticktext = n_cluster_list
        )
    )
    fig.show()
def cluster_hourly_usage(foo_df, foo_title):
    """Plot per-household mean hourly usage, then k-means (k=3) cluster profiles.

    Parameters
    ----------
    foo_df : pandas.DataFrame
        Must contain 'LCLid', 'DateTime' (datetime64) and 'KWH/hh' columns.
    foo_title : str
        Customer-group name used in the plot titles (e.g. 'STD', 'DToU').

    Displays two plotly line charts; returns None.
    """
    cols_to_use = ['LCLid', 'DateTime', 'KWH/hh']
    # .copy() so overwriting DateTime with its hour below cannot trigger a
    # SettingWithCopyWarning or mutate the caller's frame.
    temp_df = foo_df[cols_to_use].copy()
    temp_df["DateTime"] = temp_df["DateTime"].dt.hour
    groupby = temp_df.groupby(['LCLid', 'DateTime'])["KWH/hh"].mean()
    groupby = groupby.reset_index()
    # One line per household: mean usage by hour of day.
    fig = px.line(groupby, x='DateTime', y='KWH/hh', color='LCLid')
    fig.update_layout(
        title_text = 'Mean Hourly Usage for {} Groups'.format(foo_title),
        yaxis_title = "KWH/hh",
        xaxis_title = "Hour of the Day",
        showlegend=False,
        xaxis = dict(
            tickmode = 'array',
            tickvals = np.arange(24),
            ticktext = np.arange(24)
        )
    )
    fig.show()
    # Cluster the (hour, usage) rows and plot the mean profile of each cluster.
    groupby = groupby.drop(['LCLid'], axis=1)
    X = groupby.values
    kmeans = KMeans(n_clusters=3)
    cluster_found = kmeans.fit_predict(X)
    groupby['cluster'] = cluster_found
    cluster_avg = groupby.groupby(['cluster', 'DateTime'])["KWH/hh"].mean()
    cluster_avg = cluster_avg.reset_index()
    fig = px.line(cluster_avg, x="DateTime", y="KWH/hh", color="cluster")
    fig.update_layout(
        title_text = '{} Groups Clustering by Hour of the Day'.format(foo_title),
        xaxis_title = "Hour of the Day",
        xaxis = dict(
            tickmode = 'array',
            tickvals = np.arange(24),
            ticktext = np.arange(24)
        )
    )
    fig.show()
# Hourly-profile silhouette analysis and clustering, run for both the standard
# (STD) and dynamic time-of-use (DToU) customer groups.
find_plot_silhoutte(df, 'STD')
cluster_hourly_usage(df, 'STD')
find_plot_silhoutte(df2, 'DToU')
cluster_hourly_usage(df2, 'DToU')
# Outlier detection for standard-tariff customers: Isolation Forest over the
# four seasonal features plus the label-encoded ACORN group.
standard_cus.head()
from sklearn import preprocessing
encoder = preprocessing.LabelEncoder()
standard_cus["Acorn_group"] = encoder.fit_transform(standard_cus["Acorn_group"])
standard_cus.head()
from sklearn.ensemble import IsolationForest
forest = IsolationForest(max_samples="auto")
# fit_predict flags each row: -1 = outlier, 1 = inlier.
outlier_flags = forest.fit_predict(
    standard_cus[["Summer", "Winter", "Spring", "Autumn", "Acorn_group"]]
)
outlier_flags
len(outlier_flags)
standard_cus["outlier"] = outlier_flags
standard_cus["outlier"].value_counts()
standard_cus[standard_cus["outlier"] == -1]
# Scatter Summer against each other season, coloured by the outlier flag.
for season, plot_title in (("Spring", "Standard customer Summer vs spring"),
                           ("Winter", "Standard customer Summer vs winter"),
                           ("Autumn", "Standard customer Summer vs autumn")):
    standard_cus.plot.scatter('Summer', season, c='outlier', colormap='gist_rainbow', title=plot_title)
# Outlier detection for dynamic-tariff customers: Isolation Forest over the
# four seasonal features plus the label-encoded ACORN group.
dynamic_cust.head()
from sklearn import preprocessing
encoder = preprocessing.LabelEncoder()
dynamic_cust["Acorn_group"] = encoder.fit_transform(dynamic_cust["Acorn_group"])
from sklearn.ensemble import IsolationForest
forest = IsolationForest(max_samples="auto")
# fit_predict flags each row: -1 = outlier, 1 = inlier.
outlier_flags = forest.fit_predict(
    dynamic_cust[["Summer", "Winter", "Spring", "Autumn", "Acorn_group"]]
)
outlier_flags
dynamic_cust["outlier"] = outlier_flags
dynamic_cust["outlier"].value_counts()
# Scatter Summer against each other season, coloured by the outlier flag.
for season, plot_title in (("Autumn", "dynamic customer Summer vs autumn"),
                           ("Winter", "dynamic customer Summer vs winter"),
                           ("Spring", "dynamic customer Summer vs spring")):
    dynamic_cust.plot.scatter('Summer', season, c='outlier', colormap='gist_rainbow', title=plot_title)